import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score,train_test_split, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error,r2_score,roc_curve,auc,precision_recall_curve, accuracy_score, \
recall_score, precision_score, confusion_matrix
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import VotingRegressor, VotingClassifier, StackingRegressor, StackingClassifier, GradientBoostingRegressor,GradientBoostingClassifier, BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier,AdaBoostRegressor,AdaBoostClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.neighbors import KNeighborsRegressor
import itertools as it
import time as time
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from catboost import CatBoostClassifier
#from pyearth import EarthXGBoost Modeling
Team Zzz
wine = pd.read_csv(r'C:\Users\Yasmeen\Documents\STAT 303-3\project\winequality-white.csv', sep=';')wine| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.0 | 0.27 | 0.36 | 20.7 | 0.045 | 45.0 | 170.0 | 1.00100 | 3.00 | 0.45 | 8.8 | 6 |
| 1 | 6.3 | 0.30 | 0.34 | 1.6 | 0.049 | 14.0 | 132.0 | 0.99400 | 3.30 | 0.49 | 9.5 | 6 |
| 2 | 8.1 | 0.28 | 0.40 | 6.9 | 0.050 | 30.0 | 97.0 | 0.99510 | 3.26 | 0.44 | 10.1 | 6 |
| 3 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.99560 | 3.19 | 0.40 | 9.9 | 6 |
| 4 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.99560 | 3.19 | 0.40 | 9.9 | 6 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4893 | 6.2 | 0.21 | 0.29 | 1.6 | 0.039 | 24.0 | 92.0 | 0.99114 | 3.27 | 0.50 | 11.2 | 6 |
| 4894 | 6.6 | 0.32 | 0.36 | 8.0 | 0.047 | 57.0 | 168.0 | 0.99490 | 3.15 | 0.46 | 9.6 | 5 |
| 4895 | 6.5 | 0.24 | 0.19 | 1.2 | 0.041 | 30.0 | 111.0 | 0.99254 | 2.99 | 0.46 | 9.4 | 6 |
| 4896 | 5.5 | 0.29 | 0.30 | 1.1 | 0.022 | 20.0 | 110.0 | 0.98869 | 3.34 | 0.38 | 12.8 | 7 |
| 4897 | 6.0 | 0.21 | 0.38 | 0.8 | 0.020 | 22.0 | 98.0 | 0.98941 | 3.26 | 0.32 | 11.8 | 6 |
4898 rows × 12 columns
0.1 Checking Distribution of Response
sns.histplot(data=wine, x='quality')<AxesSubplot:xlabel='quality', ylabel='Count'>
The distribution is normal, with the highest number of wine in the category of 6. This might be problematic, as the data will less accurately be able to predict variables that are in categories other than 6. In order to take care of this, I will be using ensemble models, which have an advantage when dealing with imbalanced datasets, since they assign higher weights to the misclassified samples or minority class samples.
The F1 score is especially valuable when dealing with imbalanced datasets, where one class is more dominant than the other. In such cases, accuracy alone may be misleading because the model could achieve high accuracy by simply predicting the dominant class. The F1 score considers both true positives and false negatives, giving a more accurate evaluation of the model’s performance in such scenarios.
sns.pairplot(wine)We can see some outliers in the residual response, citric acid, density, and fixed acidity (against quality) plots. I will continue to investigate whether these outliers all stem from a single point (in that case it might be interesting to inspect it’s effect on the data, as it could be that that outlier is a single faulty observation).
0.1.0.1 Checking for whether the outlier is a single point
# Step 1: Identify outliers for each predictor
outliers = {} # Dictionary to store outliers for each predictor
for column in wine.columns:
# Calculate z-scores for each column
z_scores = np.abs((wine[column] - wine[column].mean()) / wine[column].std())
# Define a threshold for outlier detection (e.g., z-score > 3)
threshold = 3
# Identify outliers based on the threshold
outliers[column] = wine[z_scores > threshold]
# Step 2: Combine outliers from all predictors
combined_outliers = pd.concat(outliers.values()).drop_duplicates()
# Step 3: Determine common outliers
common_outliers = combined_outliers[combined_outliers.duplicated(keep=False)]
# Display the common outliers
print("Common Outliers:")
print(common_outliers)Common Outliers:
Empty DataFrame
Columns: [fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]
Index: []
There are no common outliers - the outliers are different observations for different predictors.
0.1.1 Checking the distribution of the variables from for individual predictors
import matplotlib.pyplot as plt
import seaborn as sns
selected_features = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides',
'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
# Calculate the number of rows and columns based on the total number of features
num_features = len(selected_features)
num_rows = int(num_features ** 0.5)
num_cols = (num_features + num_rows - 1) // num_rows
# Create a grid of subplots
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 15))
# Iterate over the selected features and plot each density plot on a different subplot
for i, feature in enumerate(selected_features):
row = i // num_cols
col = i % num_cols
ax = axes[row, col] if num_rows > 1 else axes[col]
sns.distplot(wine[feature], ax=ax, kde=False, hist_kws={'edgecolor': 'black'})
ax.set_title(feature)
ax.set_xlabel('')
ax.set_ylabel('Density')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.tight_layout()
plt.show()We can see here that the scale for the different variables is quite varied. In order to take this into account, we standardized our data before training the model.
Moreover, due to the skewed nature of some of the variables, we also experimented with log transforming some of the especially skewed data when training models (specifically, residual sugar).
corr = wine.corr()
corr| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| fixed acidity | 1.000000 | -0.022697 | 0.289181 | 0.089021 | 0.023086 | -0.049396 | 0.091070 | 0.265331 | -0.425858 | -0.017143 | -0.120881 | -0.113663 |
| volatile acidity | -0.022697 | 1.000000 | -0.149472 | 0.064286 | 0.070512 | -0.097012 | 0.089261 | 0.027114 | -0.031915 | -0.035728 | 0.067718 | -0.194723 |
| citric acid | 0.289181 | -0.149472 | 1.000000 | 0.094212 | 0.114364 | 0.094077 | 0.121131 | 0.149503 | -0.163748 | 0.062331 | -0.075729 | -0.009209 |
| residual sugar | 0.089021 | 0.064286 | 0.094212 | 1.000000 | 0.088685 | 0.299098 | 0.401439 | 0.838966 | -0.194133 | -0.026664 | -0.450631 | -0.097577 |
| chlorides | 0.023086 | 0.070512 | 0.114364 | 0.088685 | 1.000000 | 0.101392 | 0.198910 | 0.257211 | -0.090439 | 0.016763 | -0.360189 | -0.209934 |
| free sulfur dioxide | -0.049396 | -0.097012 | 0.094077 | 0.299098 | 0.101392 | 1.000000 | 0.615501 | 0.294210 | -0.000618 | 0.059217 | -0.250104 | 0.008158 |
| total sulfur dioxide | 0.091070 | 0.089261 | 0.121131 | 0.401439 | 0.198910 | 0.615501 | 1.000000 | 0.529881 | 0.002321 | 0.134562 | -0.448892 | -0.174737 |
| density | 0.265331 | 0.027114 | 0.149503 | 0.838966 | 0.257211 | 0.294210 | 0.529881 | 1.000000 | -0.093591 | 0.074493 | -0.780138 | -0.307123 |
| pH | -0.425858 | -0.031915 | -0.163748 | -0.194133 | -0.090439 | -0.000618 | 0.002321 | -0.093591 | 1.000000 | 0.155951 | 0.121432 | 0.099427 |
| sulphates | -0.017143 | -0.035728 | 0.062331 | -0.026664 | 0.016763 | 0.059217 | 0.134562 | 0.074493 | 0.155951 | 1.000000 | -0.017433 | 0.053678 |
| alcohol | -0.120881 | 0.067718 | -0.075729 | -0.450631 | -0.360189 | -0.250104 | -0.448892 | -0.780138 | 0.121432 | -0.017433 | 1.000000 | 0.435575 |
| quality | -0.113663 | -0.194723 | -0.009209 | -0.097577 | -0.209934 | 0.008158 | -0.174737 | -0.307123 | 0.099427 | 0.053678 | 0.435575 | 1.000000 |
corr > 0.5| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| fixed acidity | True | False | False | False | False | False | False | False | False | False | False | False |
| volatile acidity | False | True | False | False | False | False | False | False | False | False | False | False |
| citric acid | False | False | True | False | False | False | False | False | False | False | False | False |
| residual sugar | False | False | False | True | False | False | False | True | False | False | False | False |
| chlorides | False | False | False | False | True | False | False | False | False | False | False | False |
| free sulfur dioxide | False | False | False | False | False | True | True | False | False | False | False | False |
| total sulfur dioxide | False | False | False | False | False | True | True | True | False | False | False | False |
| density | False | False | False | True | False | False | True | True | False | False | False | False |
| pH | False | False | False | False | False | False | False | False | True | False | False | False |
| sulphates | False | False | False | False | False | False | False | False | False | True | False | False |
| alcohol | False | False | False | False | False | False | False | False | False | False | True | False |
| quality | False | False | False | False | False | False | False | False | False | False | False | True |
0.1.1.0.0.1 Some observations:
It seems like residual sugar and density are highly correlated, as well as density and total sulfur dioxide.
Free sulfur dioxide is highly correlated with total sulfur dioxide.
0.1.1.0.0.2 Dealing with Outliers:
XGBoost is a great algorithm to deal with outliers since it incorporates features that reduce the effect of the outliers on the fitting of the model. For example, it uses regularization parameters including gamma, alpha, and lampba which help prevent individual trees from being overly influenced by outliers. We can also use the learning rate and early stopping to prevent the influence of outliers by overfitting the data
0.1.1.1 Checking for null values
wine.isnull().sum().sum()0
There are no null values.
0.2 Making Categorical Vairables
wine = pd.concat([wine.drop('quality', axis = 1), pd.get_dummies(wine['quality'])], axis = 1)wine| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.0 | 0.27 | 0.36 | 20.7 | 0.045 | 45.0 | 170.0 | 1.00100 | 3.00 | 0.45 | 8.8 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 6.3 | 0.30 | 0.34 | 1.6 | 0.049 | 14.0 | 132.0 | 0.99400 | 3.30 | 0.49 | 9.5 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 8.1 | 0.28 | 0.40 | 6.9 | 0.050 | 30.0 | 97.0 | 0.99510 | 3.26 | 0.44 | 10.1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.99560 | 3.19 | 0.40 | 9.9 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.99560 | 3.19 | 0.40 | 9.9 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4893 | 6.2 | 0.21 | 0.29 | 1.6 | 0.039 | 24.0 | 92.0 | 0.99114 | 3.27 | 0.50 | 11.2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4894 | 6.6 | 0.32 | 0.36 | 8.0 | 0.047 | 57.0 | 168.0 | 0.99490 | 3.15 | 0.46 | 9.6 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4895 | 6.5 | 0.24 | 0.19 | 1.2 | 0.041 | 30.0 | 111.0 | 0.99254 | 2.99 | 0.46 | 9.4 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4896 | 5.5 | 0.29 | 0.30 | 1.1 | 0.022 | 20.0 | 110.0 | 0.98869 | 3.34 | 0.38 | 12.8 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 4897 | 6.0 | 0.21 | 0.38 | 0.8 | 0.020 | 22.0 | 98.0 | 0.98941 | 3.26 | 0.32 | 11.8 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
4898 rows × 18 columns
# Rename all columns using a list of new names
wine.columns = wine.columns.astype(str)wine| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.0 | 0.27 | 0.36 | 20.7 | 0.045 | 45.0 | 170.0 | 1.00100 | 3.00 | 0.45 | 8.8 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 6.3 | 0.30 | 0.34 | 1.6 | 0.049 | 14.0 | 132.0 | 0.99400 | 3.30 | 0.49 | 9.5 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 8.1 | 0.28 | 0.40 | 6.9 | 0.050 | 30.0 | 97.0 | 0.99510 | 3.26 | 0.44 | 10.1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.99560 | 3.19 | 0.40 | 9.9 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.99560 | 3.19 | 0.40 | 9.9 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4893 | 6.2 | 0.21 | 0.29 | 1.6 | 0.039 | 24.0 | 92.0 | 0.99114 | 3.27 | 0.50 | 11.2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4894 | 6.6 | 0.32 | 0.36 | 8.0 | 0.047 | 57.0 | 168.0 | 0.99490 | 3.15 | 0.46 | 9.6 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4895 | 6.5 | 0.24 | 0.19 | 1.2 | 0.041 | 30.0 | 111.0 | 0.99254 | 2.99 | 0.46 | 9.4 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4896 | 5.5 | 0.29 | 0.30 | 1.1 | 0.022 | 20.0 | 110.0 | 0.98869 | 3.34 | 0.38 | 12.8 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 4897 | 6.0 | 0.21 | 0.38 | 0.8 | 0.020 | 22.0 | 98.0 | 0.98941 | 3.26 | 0.32 | 11.8 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
4898 rows × 18 columns
X_train = pd.read_csv(r'C:\Users\Yasmeen\Documents\STAT 303-3\project\X_train.csv', sep=',')
X_test = pd.read_csv(r'C:\Users\Yasmeen\Documents\STAT 303-3\project\X_test.csv', sep=',')
y_train = pd.read_csv(r'C:\Users\Yasmeen\Documents\STAT 303-3\project\y_train.csv', sep=',')
y_test = pd.read_csv(r'C:\Users\Yasmeen\Documents\STAT 303-3\project\y_test.csv', sep=',')0.3 XG Boost Model
In order to take class imbalance into account, I will be implementing an algorithm that balances the class weights of the individual classes in the XGBoost model.
First the different classes need to be binarized for the model to evaluate it as multiclass response variables.
# mapping the variables from 0 to 7 (the model does not accept the standard format)
num = {3: 0,
4: 1,
5: 2,
6: 3,
7: 4,
8: 5,
9: 6}
y_train = y_train['quality'].map(num)
y_test = y_test['quality'].map(num)# taking weights into account using an algorithm that rebalances the weights
def CreateBalancedSampleWeights(y_train, largest_class_weight_coef):
# taking the unique classes
classes = np.unique(y_train, axis = 0)
# sorting the classes depending on the number of observations in ascending order
classes.sort()
#This line uses the np.bincount() function to count the number of occurrences of each class in the y_train array. The resulting counts are stored in the class_samples array.
class_samples = np.bincount(y_train)
# The total number of samples in y_train is calculated by summing up all the counts in class_samples.
total_samples = class_samples.sum()
# The variable n_classes stores the number of unique classes found in y_train.
n_classes = len(class_samples)
# This line calculates the weight for each class by dividing the total number of samples by the product of the number of classes, the class samples, and 1.0 (used to ensure floating-point division).
weights = total_samples / (n_classes * class_samples * 1.0)
# The class_weight_dict dictionary is created, mapping each class to its corresponding weight. This is done by pairing up the classes array with the weights array using the zip() function.
class_weight_dict = {key : value for (key, value) in zip(classes, weights)}
# The weight of the second class (index 1 in classes) is multiplied by largest_class_weight_coef and updated in the class_weight_dict.
class_weight_dict[classes[1]] = class_weight_dict[classes[1]] * largest_class_weight_coef
# This line creates a list called sample_weights by iterating over each value y in y_train and retrieving the corresponding weight from class_weight_dict.
sample_weights = [class_weight_dict[y] for y in y_train]
return sample_weights# identifying the largert class coefficient
largest_class_weight_coef = max(y_train.value_counts().values)/y_train.shape[0]
#pass y_train as numpy array
weight = CreateBalancedSampleWeights(y_train, largest_class_weight_coef)largest_class_weight_coef = max(y_train.value_counts().values)/y_train.shape[0]
largest_class_weight_coef0.4462219196732471
classes = np.unique(y_train, axis = 0)
# sorting the classes depending on the number of observations in ascending order
classes.sort()
#This line uses the np.bincount() function to count the number of occurrences of each class in the y_train array. The resulting counts are stored in the class_samples array.
class_samples = np.bincount(y_train)
# The total number of samples in y_train is calculated by summing up all the counts in class_samples.
total_samples = class_samples.sum()
# The variable n_classes stores the number of unique classes found in y_train.
n_classes = len(class_samples)
# This line calculates the weight for each class by dividing the total number of samples by the product of the number of classes, the class samples, and 1.0 (used to ensure floating-point division).
weights = total_samples / (n_classes * class_samples * 1.0)
class_weight_dict = {key : value for (key, value) in zip(classes, weights)}
class_weight_dict{0: 34.976190476190474,
1: 3.8155844155844156,
2: 0.4774906549650577,
3: 0.32014819657840254,
4: 0.8181565023670286,
5: 3.886243386243386,
6: 83.94285714285714}
class_samplesarray([ 12, 110, 879, 1311, 513, 108, 5], dtype=int64)
class_weight_dict[classes[1]] = class_weight_dict[classes[1]] * largest_class_weight_coef
class_weight_dict[classes[1]]1.7025974025974027
sample_weights = [class_weight_dict[y] for y in y_train]
sample_weights[0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
1.7025974025974027,
1.7025974025974027,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
3.886243386243386,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
1.7025974025974027,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
3.886243386243386,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
3.886243386243386,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
1.7025974025974027,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
3.886243386243386,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
1.7025974025974027,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
3.886243386243386,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
3.886243386243386,
0.4774906549650577,
0.4774906549650577,
3.886243386243386,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
3.886243386243386,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
1.7025974025974027,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
3.886243386243386,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
3.886243386243386,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
3.886243386243386,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
3.886243386243386,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
1.7025974025974027,
0.32014819657840254,
0.4774906549650577,
1.7025974025974027,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
3.886243386243386,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
3.886243386243386,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
3.886243386243386,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
3.886243386243386,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
3.886243386243386,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
1.7025974025974027,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
1.7025974025974027,
3.886243386243386,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
1.7025974025974027,
0.4774906549650577,
0.4774906549650577,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
3.886243386243386,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
1.7025974025974027,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
3.886243386243386,
0.32014819657840254,
0.8181565023670286,
3.886243386243386,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
34.976190476190474,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
3.886243386243386,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
3.886243386243386,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
3.886243386243386,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
34.976190476190474,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
3.886243386243386,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
1.7025974025974027,
0.4774906549650577,
0.32014819657840254,
1.7025974025974027,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
1.7025974025974027,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
3.886243386243386,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
3.886243386243386,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
34.976190476190474,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
1.7025974025974027,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
1.7025974025974027,
0.32014819657840254,
0.4774906549650577,
3.886243386243386,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
3.886243386243386,
0.4774906549650577,
0.4774906549650577,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
1.7025974025974027,
0.4774906549650577,
3.886243386243386,
1.7025974025974027,
0.8181565023670286,
0.32014819657840254,
3.886243386243386,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
3.886243386243386,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
3.886243386243386,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
1.7025974025974027,
0.8181565023670286,
0.4774906549650577,
1.7025974025974027,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
3.886243386243386,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
3.886243386243386,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
3.886243386243386,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
3.886243386243386,
3.886243386243386,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
3.886243386243386,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
1.7025974025974027,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
34.976190476190474,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
3.886243386243386,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
...]
len(sample_weights)2938
weight[0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
1.7025974025974027,
1.7025974025974027,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
3.886243386243386,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
1.7025974025974027,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
3.886243386243386,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
3.886243386243386,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
1.7025974025974027,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
3.886243386243386,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
1.7025974025974027,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
3.886243386243386,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
3.886243386243386,
0.4774906549650577,
0.4774906549650577,
3.886243386243386,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
3.886243386243386,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
1.7025974025974027,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
3.886243386243386,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
3.886243386243386,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
3.886243386243386,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
3.886243386243386,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
1.7025974025974027,
0.32014819657840254,
0.4774906549650577,
1.7025974025974027,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
3.886243386243386,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
3.886243386243386,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
3.886243386243386,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
3.886243386243386,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
3.886243386243386,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
1.7025974025974027,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
1.7025974025974027,
3.886243386243386,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
1.7025974025974027,
0.4774906549650577,
0.4774906549650577,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
3.886243386243386,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
1.7025974025974027,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
3.886243386243386,
0.32014819657840254,
0.8181565023670286,
3.886243386243386,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
34.976190476190474,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
3.886243386243386,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
3.886243386243386,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
3.886243386243386,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
34.976190476190474,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
3.886243386243386,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
1.7025974025974027,
0.4774906549650577,
0.32014819657840254,
1.7025974025974027,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
1.7025974025974027,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
3.886243386243386,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
3.886243386243386,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
34.976190476190474,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
1.7025974025974027,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
1.7025974025974027,
0.32014819657840254,
0.4774906549650577,
3.886243386243386,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
3.886243386243386,
0.4774906549650577,
0.4774906549650577,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
1.7025974025974027,
0.4774906549650577,
3.886243386243386,
1.7025974025974027,
0.8181565023670286,
0.32014819657840254,
3.886243386243386,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
3.886243386243386,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
3.886243386243386,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
1.7025974025974027,
0.8181565023670286,
0.4774906549650577,
1.7025974025974027,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
3.886243386243386,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
3.886243386243386,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
3.886243386243386,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
3.886243386243386,
3.886243386243386,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
3.886243386243386,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
1.7025974025974027,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.4774906549650577,
0.4774906549650577,
0.32014819657840254,
0.8181565023670286,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
0.32014819657840254,
34.976190476190474,
0.4774906549650577,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
1.7025974025974027,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
3.886243386243386,
0.32014819657840254,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.8181565023670286,
0.8181565023670286,
0.32014819657840254,
0.32014819657840254,
0.4774906549650577,
0.32014819657840254,
...]
y_train0 2
1 3
2 2
3 3
4 3
..
2933 3
2934 3
2935 3
2936 3
2937 5
Name: quality, Length: 2938, dtype: int64
The algorithm adjusts the weight of each variable accordingly.
0.3.1 Base Model
# Create XGBoost classifier with the custom evaluation metric
model = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
seed=45)
model.fit(X_train, y_train, sample_weight = weight)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False,
eval_metric=<function f1_score at 0x00000295BED9F3A0>,
feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=None, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100, n_jobs=None,
num_class=7, num_parallel_tree=None, objective='multi:softmax', ...)
pd.concat([pd.Series(X_train.columns, name = 'predictor'),
pd.Series(model.feature_importances_,
name = 'importance')], axis = 1).sort_values(by = 'importance', ascending=False)| predictor | importance | |
|---|---|---|
| 0 | fixed acidity | 0.146282 |
| 10 | alcohol | 0.141838 |
| 5 | free sulfur dioxide | 0.116943 |
| 7 | density | 0.095891 |
| 8 | pH | 0.088791 |
| 1 | volatile acidity | 0.086287 |
| 3 | residual sugar | 0.084754 |
| 4 | chlorides | 0.074907 |
| 6 | total sulfur dioxide | 0.060982 |
| 9 | sulphates | 0.053796 |
| 2 | citric acid | 0.049529 |
model.score(X_test, y_test)0.6433673469387755
The F1 score for the base model is of 64.3 %.
1 Tuning of Parameters
Multiclass:soft is being used as the obsejective as we are looking at multiple classes. I also specified the num_class (7). Eval metric set to f1_weighted for the cross_val_score argument.
1.0.1 Number of Trees
1.0.1.1 Coarse
def get_models():
models = dict()
# define number of trees to consider
n_trees = [5, 10, 50, 100, 500, 1000, 2000, 5000]
for n in n_trees:
models[str(n)] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
n_estimators=n,
random_state = 1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('Number of trees',fontsize=15)>5 0.450 (0.015)
>10 0.492 (0.012)
>50 0.577 (0.011)
>100 0.591 (0.005)
>500 0.590 (0.011)
>1000 0.593 (0.010)
>2000 0.592 (0.008)
>5000 0.591 (0.008)
Text(0.5, 0, 'Number of trees')
1.0.1.2 Fine
def get_models():
models = dict()
# define number of trees to consider
n_trees = [60, 80, 100, 150, 200]
for n in n_trees:
models[str(n)] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
n_estimators=n,
random_state = 1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('Number of trees',fontsize=15)>60 0.585 (0.011)
>80 0.591 (0.006)
>100 0.591 (0.005)
>150 0.594 (0.008)
>200 0.589 (0.010)
Text(0.5, 0, 'Number of trees')
An ideal range for the number of trees to be used in the model is around [150].
1.1 Depth
Max depth is used to control how many layers there are to the tree. It helps avoid overfitting by having too deep or a tree and underfitting by having too shallow of a tree.
1.1.0.0.0.1 Coarse
# get a list of models to evaluate
def get_models():
models = dict()
# explore depths from 1 to 10
depth = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
for n in depth:
models[str(n)] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
max_depth=n,
random_state = 1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('Number of trees',fontsize=15)>3 0.532 (0.016)
>4 0.557 (0.005)
>5 0.583 (0.011)
>6 0.591 (0.005)
>7 0.597 (0.011)
>8 0.595 (0.008)
>9 0.598 (0.016)
>10 0.603 (0.014)
>11 0.598 (0.008)
>12 0.605 (0.015)
>13 0.604 (0.009)
Text(0.5, 0, 'Number of trees')
1.1.0.0.1 Fine
# get a list of models to evaluate
def get_models():
models = dict()
# explore depths from 1 to 10
depth = [6, 7, 8, 9, 10]
for n in depth:
models[str(n)] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
max_depth=n,
random_state = 1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('Number of trees',fontsize=15)>6 0.591 (0.005)
>7 0.597 (0.011)
>8 0.595 (0.008)
>9 0.598 (0.016)
>10 0.603 (0.014)
Text(0.5, 0, 'Number of trees')
The ideal range of the depth of trees that seems ideal is [7, 9, 10].
1.2 Learning Rate
# get a list of models to evaluate
def get_models():
models = dict()
for i in [0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.8,1.0]:
key = '%.4f' % i
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
learning_rate= i,
random_state = 1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('Learning Rate',fontsize=15)>0.0100 0.430 (0.011)
>0.0500 0.515 (0.012)
>0.1000 0.557 (0.016)
>0.2000 0.586 (0.005)
>0.3000 0.591 (0.005)
>0.4000 0.590 (0.016)
>0.5000 0.592 (0.014)
>0.6000 0.593 (0.016)
>0.8000 0.589 (0.016)
>1.0000 0.591 (0.012)
Text(0.5, 0, 'Learning Rate')
1.2.0.0.0.1 Fine
# get a list of models to evaluate
def get_models():
models = dict()
for i in [0.1, 0.15, 0.2, 0.25,0.3, 0.35, 0.4]:
key = '%.4f' % i
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
learning_rate= i,
random_state = 1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('Learning Rate',fontsize=15)>0.1000 0.557 (0.016)
>0.1500 0.584 (0.008)
>0.2000 0.586 (0.005)
>0.2500 0.589 (0.014)
>0.3000 0.591 (0.005)
>0.3500 0.590 (0.010)
>0.4000 0.590 (0.016)
Text(0.5, 0, 'Learning Rate')
The learning rates of [0.2, 0.3, 0.35] seems ideal.
1.3 Reg Lambda
L2 regularization, similar to ridge regression. Increase value will make model more conservative.
1.3.0.0.0.1 Coarse
def get_models():
models = dict()
for i in [0,0.5,1.0,1.5,2,10,100, 500, 1000]:
key = '%.2f' % i
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
reg_lambda=i,
random_state=1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('reg_lambda',fontsize=15)>0.00 0.594 (0.012)
>0.50 0.595 (0.016)
>1.00 0.591 (0.005)
>1.50 0.582 (0.007)
>2.00 0.588 (0.014)
>10.00 0.585 (0.012)
>100.00 0.536 (0.011)
>500.00 0.490 (0.011)
>1000.00 0.462 (0.019)
Text(0.5, 0, 'reg_lambda')
1.3.0.0.0.2 Fine
def get_models():
models = dict()
for i in [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 5.0, 10]:
key = '%.1f' % i
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
reg_lambda=i,
random_state=1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('reg_lambda',fontsize=15)>0.5 0.595 (0.016)
>1.0 0.591 (0.005)
>1.5 0.582 (0.007)
>2.0 0.588 (0.014)
>2.5 0.592 (0.012)
>3.0 0.591 (0.007)
>5.0 0.587 (0.010)
>10.0 0.585 (0.012)
Text(0.5, 0, 'reg_lambda')
A reg lambda value of [1.0, 2.5, 3.0] seem ideal.
1.4 Gamma
Specifies the minimum values of the loss reduction (by the loss function) required to make a split. It makes the algorithm more conservative and avoids overfitting.
1.4.0.0.0.1 Coarse
def get_models():
models = dict()
for i in [0,10,1e2,1e3,1e4,1e5,1e6,1e7,1e8,1e9]:
key = '%.0f' % i
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
gamma=i,
random_state=1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('reg_lambda',fontsize=15)>0 0.591 (0.005)
>10 0.413 (0.015)
>100 0.210 (0.036)
>1000 0.035 (0.050)
>10000 0.035 (0.050)
>100000 0.035 (0.050)
>1000000 0.035 (0.050)
>10000000 0.035 (0.050)
>100000000 0.035 (0.050)
>1000000000 0.035 (0.050)
Text(0.5, 0, 'reg_lambda')
1.4.0.0.1 Fine
def get_models():
models = dict()
for i in [0, 1, 2, 3, 4, 5]:
key = '%.1f' % i
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
gamma=i,
random_state=1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('reg_lambda',fontsize=15)>0.0 0.591 (0.005)
>1.0 0.525 (0.015)
>2.0 0.502 (0.017)
>3.0 0.482 (0.031)
>4.0 0.461 (0.021)
>5.0 0.439 (0.024)
Text(0.5, 0, 'reg_lambda')
1.4.0.0.2 Super fine search
def get_models():
models = dict()
for i in [0, 0.1, 0.2, 0.3, 0.4, 0.5]:
key = '%.1f' % i
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
gamma=i,
random_state=1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('reg_lambda',fontsize=15)>0.0 0.591 (0.005)
>0.1 0.588 (0.006)
>0.2 0.579 (0.021)
>0.3 0.585 (0.010)
>0.4 0.576 (0.007)
>0.5 0.558 (0.018)
Text(0.5, 0, 'reg_lambda')
1.4.0.0.3 Super super fine search
def get_models():
models = dict()
for i in [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]:
key = '%.3f' % i
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
gamma=i,
random_state=1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('reg_lambda',fontsize=15)>0.000 0.591 (0.005)
>0.010 0.591 (0.018)
>0.020 0.594 (0.012)
>0.030 0.592 (0.009)
>0.040 0.591 (0.013)
>0.050 0.593 (0.011)
>0.060 0.596 (0.014)
>0.070 0.589 (0.010)
>0.080 0.592 (0.012)
>0.090 0.594 (0.007)
>0.100 0.588 (0.006)
Text(0.5, 0, 'reg_lambda')
Gamma [0, 0.04, 0.06, 0.07] seems ideal.
1.5 Subsample
Denotes the fraction of observations to be random samples for each tree. Lower values prevent overfitting by making the more conservative, but too small values may lead to underfitting.
1.5.0.0.1 Coarse
def get_models():
models = dict()
sub = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
for s in sub:
key = '%.2f' % s
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
subsample = s,
random_state=1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('reg_lambda',fontsize=15)>0.10 0.524 (0.025)
>0.20 0.561 (0.013)
>0.30 0.567 (0.009)
>0.40 0.581 (0.015)
>0.50 0.581 (0.010)
>0.60 0.581 (0.012)
>0.70 0.598 (0.016)
>0.80 0.606 (0.015)
>0.90 0.599 (0.015)
>1.00 0.591 (0.005)
Text(0.5, 0, 'reg_lambda')
1.5.0.0.2 Fine
def get_models():
models = dict()
sub = [0.7, 0.75, 0.8, 0.85, 0.9]
for s in sub:
key = '%.2f' % s
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
subsample = s,
random_state=1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('reg_lambda',fontsize=15)>0.70 0.598 (0.016)
>0.75 0.592 (0.008)
>0.80 0.606 (0.015)
>0.85 0.592 (0.013)
>0.90 0.599 (0.015)
Text(0.5, 0, 'reg_lambda')
Seems like the optimal range for coarse subsample is [0.75, 0.8, 0.85]
1.6 Model Performance
1.6.1 F1-score
start_time = time.time()
param_grid = {'n_estimators':[150],
'max_depth': [7, 9, 10],
'learning_rate': [0.2, 0.3, 0.35],
'gamma': [0, 0.04, 0.06, 0.07],
'reg_lambda':[1.0, 2.5, 3.0],
'subsample': [0.75, 0.8, 0.85]}
cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=1)
optimal_params = GridSearchCV(estimator=xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
random_state=1,
use_label_encoder=False),
param_grid = param_grid,
scoring = 'f1_weighted',
verbose = 1,
n_jobs= -1,
cv = cv)
optimal_params.fit(X_train, y_train, sample_weight = weight)
print(optimal_params.best_params_,optimal_params.best_score_)
print("Time taken = ", (time.time()-start_time)/60, " minutes")Fitting 5 folds for each of 324 candidates, totalling 1620 fits
{'gamma': 0.06, 'learning_rate': 0.3, 'max_depth': 9, 'n_estimators': 150, 'reg_lambda': 1.0, 'subsample': 0.8} 0.6061359234970032
Time taken = 16.490493313471475 minutes
1.7 F1
model = xgb.XGBClassifier(objective = 'multi:softmax', random_state = 1, gamma = 0.06, learning_rate = 0.3, max_depth = 9,
n_estimators = 150, reg_lambda = 1.0, subsample = 0.8, num_class= 7, eval_metric=f1_score)model.fit(X_train, y_train, sample_weight = weight)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False,
eval_metric=<function f1_score at 0x000002580E81E3A0>,
feature_types=None, gamma=0.06, gpu_id=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=0.3, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=9,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=150, n_jobs=None,
num_class=7, num_parallel_tree=None, objective='multi:softmax', ...)
model.score(X_test, y_test)0.6556122448979592
After tuning and fitting the adjusted weight of the model, the F1 score improved to 65.6% (adjusting the sample weight increased the model performance from 60.6% to 65.6%).
1.8 Metrics
from sklearn import metrics
# Print the confusion matrix
print(metrics.confusion_matrix(y_test, y_pred))[[ 1 1 3 3 0 0]
[ 1 16 25 10 1 0]
[ 0 12 399 152 15 0]
[ 0 5 167 619 91 5]
[ 0 2 7 125 220 13]
[ 0 0 1 12 24 30]]
# Print the precision and recall, among other metrics
print(metrics.classification_report(y_test, y_pred, digits=3)) precision recall f1-score support
0 0.500 0.125 0.200 8
1 0.444 0.302 0.360 53
2 0.663 0.690 0.676 578
3 0.672 0.698 0.685 887
4 0.627 0.599 0.613 367
5 0.625 0.448 0.522 67
accuracy 0.656 1960
macro avg 0.589 0.477 0.509 1960
weighted avg 0.652 0.656 0.652 1960
Model was only able to predict 5 classes. This is probably due to the fact that the number of classes in the sixth class is too small. The improving of the class weights does not seem to have completely solved the class imbalance problem.
y_train.unique()array([2, 3, 4, 1, 5, 0, 6], dtype=int64)
y_train.value_counts()3 1311
2 879
4 513
1 110
5 108
0 12
6 5
Name: quality, dtype: int64
Again, we can see that category 6 has a very small number of variable. This is probably why the model was not able to predict the variables.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Metrics on Test Data
y_pred = model.predict(X_test)
print("Accuracy on Test Data: ", accuracy_score(y_test, y_pred))
print("Precision on Test Data: ", precision_score(y_test, y_pred, average='weighted'))
print("Recall on Test Data: ", recall_score(y_test, y_pred, average='weighted'))
print("F1-score on Test Data: ", f1_score(y_test, y_pred, average='weighted'))Accuracy on Test Data: 0.6556122448979592
Precision on Test Data: 0.6523983776515713
Recall on Test Data: 0.6556122448979592
F1-score on Test Data: 0.6524282997526775
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Metrics on Test Data
y_pred = model.predict(X_train)
print("Accuracy on Test Data: ", accuracy_score(y_train, y_pred))
print("Precision on Test Data: ", precision_score(y_train, y_pred, average='weighted'))
print("Recall on Test Data: ", recall_score(y_train, y_pred, average='weighted'))
print("F1-score on Test Data: ", f1_score(y_train, y_pred, average='weighted'))Accuracy on Test Data: 1.0
Precision on Test Data: 1.0
Recall on Test Data: 1.0
F1-score on Test Data: 1.0
pd.concat([pd.Series(X_train.columns, name = 'predictor'),
pd.Series(model.feature_importances_,
name = 'importance')], axis = 1).sort_values(by = 'importance', ascending=False)| predictor | importance | |
|---|---|---|
| 10 | alcohol | 0.134289 |
| 0 | fixed acidity | 0.125577 |
| 5 | free sulfur dioxide | 0.120736 |
| 8 | pH | 0.100421 |
| 7 | density | 0.094698 |
| 4 | chlorides | 0.090360 |
| 1 | volatile acidity | 0.085564 |
| 3 | residual sugar | 0.076734 |
| 9 | sulphates | 0.061030 |
| 6 | total sulfur dioxide | 0.055689 |
| 2 | citric acid | 0.054903 |
Alcohol is the most important predictor for the model.
1.8.1 Adjusting Threshold (binarization of all columns, finding threshold for each column)
train = pd.concat([X_train, y_train], axis=1)# making dummies for quality
train_dummies = pd.get_dummies(train['quality'])
train_dummies = pd.concat([X_train, train_dummies], axis=1)
train_dummies| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | 0 | 1 | 2 | 3 | 4 | 5 | 6 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.712779 | -0.280214 | -0.034638 | -0.767304 | 0.330849 | -1.370628 | -1.561658 | 0.004221 | -1.975470 | -0.086290 | -0.824276 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 1 | 0.172097 | -0.677101 | -0.447836 | 0.435486 | -0.035355 | 0.099493 | 1.944742 | 0.458979 | 0.408870 | 0.439499 | -0.092863 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | -1.013043 | 0.513561 | -0.613115 | -0.254640 | 0.239298 | -1.135409 | -0.455612 | 0.044347 | -0.253446 | -0.699710 | -1.068080 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 3 | 1.238723 | -0.180992 | -0.034638 | 0.770690 | -0.310008 | 2.216467 | 1.732946 | 0.960550 | -1.710543 | 0.001342 | -0.824276 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 0.053583 | 0.116674 | -0.695755 | 1.145330 | -0.264233 | -0.370946 | -0.055553 | 0.499105 | -0.915763 | 1.403446 | -0.092863 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2933 | -0.776015 | -0.677101 | 1.535513 | 0.021411 | 0.056196 | -0.429751 | -0.361481 | 0.051034 | 0.210175 | 0.001342 | -0.499203 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2934 | 0.172097 | -1.371654 | -0.117278 | 0.514358 | -0.310008 | 1.099175 | 0.062111 | 0.525855 | 0.210175 | -0.524447 | -0.905544 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2935 | 0.883181 | -0.081770 | 1.535513 | -0.629279 | -0.126906 | -0.429751 | 0.320973 | -0.915326 | -1.114458 | 0.351868 | 0.719818 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2936 | -0.657501 | -0.379435 | -0.365197 | 1.441098 | -0.493110 | 1.040370 | -0.102619 | 0.549262 | -0.120983 | -0.962605 | 0.069674 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2937 | 1.475751 | -0.081770 | 0.130641 | -0.925047 | -0.721988 | 0.158298 | -0.832138 | -0.494006 | 0.210175 | 1.228183 | -0.092863 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
2938 rows × 18 columns
# Making a dataset for each score
y_train_0 = train_dummies[0]
y_train_1 = train_dummies[1]
y_train_2 = train_dummies[2]
y_train_3 = train_dummies[3]
y_train_4 = train_dummies[4]
y_train_5 = train_dummies[5]
y_train_6 = train_dummies[6]1.8.1.1 Adjusting Individual Thresholds
1.8.1.1.1 y_train_0
cross_val_ypred = cross_val_predict(model, X_train, y_train_0, cv = 5, method = 'predict_proba')
# create an entire for loop
p, r, thresholds = precision_recall_curve(y_train_0, cross_val_ypred[:,1])
accuracy_scores = []
for thresh in thresholds:
y_pred = cross_val_ypred[:,1] > thresh
y_pred = y_pred.astype(int)
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
plt.figure(figsize=(8, 8))
plt.title("Precision and Recall Scores as a function of the decision threshold")
plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
plt.plot(thresholds, precisions[:-1], "o", color = 'blue')
plt.plot(thresholds, recalls[:-1], "o", color = 'green')
plt.ylabel("Score")
plt.xlabel("Decision Threshold")
plt.legend(loc='best')
plt.legend()
plot_precision_recall_vs_threshold(p, r, thresholds)Seems like the desired threshold is of around 0.023 for the category 0 (quality of 3).
# Create an empty DataFrame to store the results
results_df = pd.DataFrame(columns=['Threshold', 'Precision', 'Recall', 'F1 Score'])
y_pred_prob = model.predict_proba(X_test)[:,1]
y_pred = y_pred_prob > 0.023
y_pred = y_pred.astype(int)
# Compute the f1 score
f1 = f1_score(y_test, y_pred, average='weighted')
f10.0069715399383450255
The F1 score did not improve with the threshold found for the first class.
1.8.1.1.2 y_train_1
cross_val_ypred = cross_val_predict(model, X_train, y_train_1, cv = 5, method = 'predict_proba')
# create an entire for loop
p, r, thresholds = precision_recall_curve(y_train_1, cross_val_ypred[:,1])
accuracy_scores = []
for thresh in thresholds:
y_pred = cross_val_ypred[:,1] > thresh
y_pred = y_pred.astype(int)
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
plt.figure(figsize=(8, 8))
plt.title("Precision and Recall Scores as a function of the decision threshold")
plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
plt.plot(thresholds, precisions[:-1], "o", color = 'blue')
plt.plot(thresholds, recalls[:-1], "o", color = 'green')
plt.ylabel("Score")
plt.xlabel("Decision Threshold")
plt.legend(loc='best')
plt.legend()
plot_precision_recall_vs_threshold(p, r, thresholds)Seems like the desired threshold is of around 0.008 for the category 1 (quality of 4).
# Create an empty DataFrame to store the results
results_df = pd.DataFrame(columns=['Threshold', 'Precision', 'Recall', 'F1 Score'])
y_pred_prob = model.predict_proba(X_test)[:,1]
y_pred = y_pred_prob > 0.008
y_pred = y_pred.astype(int)
# Compute the f1 score
f1 = f1_score(y_test, y_pred, average='weighted')
f10.00512505281585628
The F1 score did not improve with the threshold found for the second class.
1.8.1.1.3 y_train_2
cross_val_ypred = cross_val_predict(model, X_train, y_train_2, cv = 5, method = 'predict_proba')
# create an entire for loop
p, r, thresholds = precision_recall_curve(y_train_2, cross_val_ypred[:,1])
accuracy_scores = []
for thresh in thresholds:
y_pred = cross_val_ypred[:,1] > thresh
y_pred = y_pred.astype(int)
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
plt.figure(figsize=(8, 8))
plt.title("Precision and Recall Scores as a function of the decision threshold")
plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
plt.plot(thresholds, precisions[:-1], "o", color = 'blue')
plt.plot(thresholds, recalls[:-1], "o", color = 'green')
plt.ylabel("Score")
plt.xlabel("Decision Threshold")
plt.legend(loc='best')
plt.legend()
plot_precision_recall_vs_threshold(p, r, thresholds)Seems like the desired threshold is of around 0.4 for the category 2 (quality of 5).
# Create an empty DataFrame to store the results
results_df = pd.DataFrame(columns=['Threshold', 'Precision', 'Recall', 'F1 Score'])
y_pred_prob = model.predict_proba(X_test)[:,1]
y_pred = y_pred_prob > 0.4
y_pred = y_pred.astype(int)
# Compute the f1 score
f1 = f1_score(y_test, y_pred, average='weighted')
f10.009043197807346656
The F1 score did not improve with the threshold found for the third class.
1.8.1.1.4 y_train_3
cross_val_ypred = cross_val_predict(model, X_train, y_train_3, cv = 5, method = 'predict_proba')
# create an entire for loop
p, r, thresholds = precision_recall_curve(y_train_3, cross_val_ypred[:,1])
accuracy_scores = []
for thresh in thresholds:
y_pred = cross_val_ypred[:,1] > thresh
y_pred = y_pred.astype(int)
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
plt.figure(figsize=(8, 8))
plt.title("Precision and Recall Scores as a function of the decision threshold")
plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
plt.plot(thresholds, precisions[:-1], "o", color = 'blue')
plt.plot(thresholds, recalls[:-1], "o", color = 'green')
plt.ylabel("Score")
plt.xlabel("Decision Threshold")
plt.legend(loc='best')
plt.legend()
plot_precision_recall_vs_threshold(p, r, thresholds)Seems like the desired threshold is of around 0.44 for the category 3 (quality of 6).
# Create an empty DataFrame to store the results
results_df = pd.DataFrame(columns=['Threshold', 'Precision', 'Recall', 'F1 Score'])
y_pred_prob = model.predict_proba(X_test)[:,1]
y_pred = y_pred_prob > 0.44
y_pred = y_pred.astype(int)
# Compute the f1 score
f1 = f1_score(y_test, y_pred, average='weighted')
f10.009462374096925592
The F1 score did not improve with the threshold found for the fourth class.
1.8.1.1.5 y_train_4
cross_val_ypred = cross_val_predict(model, X_train, y_train_4, cv = 5, method = 'predict_proba')
# create an entire for loop
p, r, thresholds = precision_recall_curve(y_train_4, cross_val_ypred[:,1])
accuracy_scores = []
for thresh in thresholds:
y_pred = cross_val_ypred[:,1] > thresh
y_pred = y_pred.astype(int)
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
plt.figure(figsize=(8, 8))
plt.title("Precision and Recall Scores as a function of the decision threshold")
plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
plt.plot(thresholds, precisions[:-1], "o", color = 'blue')
plt.plot(thresholds, recalls[:-1], "o", color = 'green')
plt.ylabel("Score")
plt.xlabel("Decision Threshold")
plt.legend(loc='best')
plt.legend()
plot_precision_recall_vs_threshold(p, r, thresholds)Seems like the desired threshold is of around 0.28 for the category 4 (quality of 7).
# Create an empty DataFrame to store the results
results_df = pd.DataFrame(columns=['Threshold', 'Precision', 'Recall', 'F1 Score'])
y_pred_prob = model.predict_proba(X_test)[:,1]
y_pred = y_pred_prob > 0.28
y_pred = y_pred.astype(int)
# Compute the f1 score
f1 = f1_score(y_test, y_pred, average='weighted')
f10.01017023952063801
The F1 score did not improve with the threshold found for the fifth class.
1.8.1.1.6 y_train_5
cross_val_ypred = cross_val_predict(model, X_train, y_train_5, cv = 5, method = 'predict_proba')
# create an entire for loop
p, r, thresholds = precision_recall_curve(y_train_5, cross_val_ypred[:,1])
accuracy_scores = []
for thresh in thresholds:
y_pred = cross_val_ypred[:,1] > thresh
y_pred = y_pred.astype(int)
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
plt.figure(figsize=(8, 8))
plt.title("Precision and Recall Scores as a function of the decision threshold")
plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
plt.plot(thresholds, precisions[:-1], "o", color = 'blue')
plt.plot(thresholds, recalls[:-1], "o", color = 'green')
plt.ylabel("Score")
plt.xlabel("Decision Threshold")
plt.legend(loc='best')
plt.legend()
plot_precision_recall_vs_threshold(p, r, thresholds)Seems like the desired threshold is of around 0.08 for the category 5 (quality of 8).
# Create an empty DataFrame to store the results
results_df = pd.DataFrame(columns=['Threshold', 'Precision', 'Recall', 'F1 Score'])
y_pred_prob = model.predict_proba(X_test)[:,1]
y_pred = y_pred_prob > 0.08
y_pred = y_pred.astype(int)
# Compute the f1 score
f1 = f1_score(y_test, y_pred, average='weighted')
f10.00904012390012854
The F1 score did not improve with the threshold found for the sixth class.
1.8.1.1.7 y_train_6
cross_val_ypred = cross_val_predict(model, X_train, y_train_6, cv = 5, method = 'predict_proba')
# create an entire for loop
p, r, thresholds = precision_recall_curve(y_train_6, cross_val_ypred[:,1])
accuracy_scores = []
for thresh in thresholds:
y_pred = cross_val_ypred[:,1] > thresh
y_pred = y_pred.astype(int)
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
plt.figure(figsize=(8, 8))
plt.title("Precision and Recall Scores as a function of the decision threshold")
plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
plt.plot(thresholds, precisions[:-1], "o", color = 'blue')
plt.plot(thresholds, recalls[:-1], "o", color = 'green')
plt.ylabel("Score")
plt.xlabel("Decision Threshold")
plt.legend(loc='best')
plt.legend()
plot_precision_recall_vs_threshold(p, r, thresholds)def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
plt.figure(figsize=(8, 8))
plt.title("Precision and Recall Scores as a function of the decision threshold")
plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
plt.plot(thresholds, precisions[:-1], "o", color = 'blue')
plt.plot(thresholds, recalls[:-1], "o", color = 'green')
plt.ylabel("Score")
plt.xlabel("Decision Threshold")
plt.legend(loc='best')
plt.legend()
# Set x-axis limits for the desired threshold range
plt.xlim(0, 0.1)
plt.show()
plot_precision_recall_vs_threshold(p, r, thresholds)Seems like the desired threshold is of around 0.02 for the category 6 (quality of 9).
# Create an empty DataFrame to store the results
results_df = pd.DataFrame(columns=['Threshold', 'Precision', 'Recall', 'F1 Score'])
y_pred_prob = model.predict_proba(X_test)[:,1]
y_pred = y_pred_prob > 0.02
y_pred = y_pred.astype(int)
# Compute the f1 score
f1 = f1_score(y_test, y_pred, average='weighted')
f10.006653216668353206
The F1 score did not improve with the threshold found for the seventh class.
1.8.1.2 Getting the threshold for the different thresholds for different classes and assessing the improvement of the model does not seem to be beneficial in the case of the F1 score.
2 Getting precision and recall of thresholds for whole data
# Create an empty DataFrame to store the results
results_df = pd.DataFrame(columns=['Threshold', 'Precision', 'Recall', 'F1 Score'])
for thresh in thresholds:
y_pred_prob = model.predict_proba(X_test)[:,1]
# Classifying observations in the positive class (y = 1) if the predicted probability is greater
# than the desired decision threshold probability
y_pred = y_pred_prob > thresh
y_pred = y_pred.astype(int)
# Compute the precision and recall
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
# Store the threshold, precision, recall, and F1 score values in the DataFrame
results_df = results_df.append({'Threshold': thresh, 'Precision': precision, 'Recall': recall, 'F1 Score': f1}, ignore_index=True)# Print the DataFrame with results
print(results_df) Threshold Precision Recall F1 Score
0 0.000023 0.000737 0.027041 0.001435
1 0.000023 0.000737 0.027041 0.001435
2 0.000023 0.000737 0.027041 0.001435
3 0.000023 0.000737 0.027041 0.001435
4 0.000023 0.000737 0.027041 0.001435
... ... ... ... ...
1196 0.028786 0.004646 0.021939 0.007467
1197 0.030803 0.004786 0.021429 0.007603
1198 0.033422 0.004943 0.020918 0.007754
1199 0.072617 0.006269 0.018878 0.008992
1200 0.61737 0.014575 0.010714 0.009614
[1201 rows x 4 columns]
# Convert 'F1 Score' column to numeric data type
results_df['F1 Score'] = pd.to_numeric(results_df['F1 Score'])
# Find the highest F1 score and its corresponding threshold
max_f1_score = results_df['F1 Score'].max()
best_threshold = results_df.loc[results_df['F1 Score'].idxmax(), 'Threshold']
# Print the highest F1 score and its corresponding threshold
print("Highest F1 Score:", max_f1_score)
print("Corresponding Threshold:", best_threshold)Highest F1 Score: 0.009613511545488234
Corresponding Threshold: 0.6173699498176575
This method did not yeild in a high F1 score.
2.0.0.0.0.1 Adjusting the threshold does not seem to have helped the model
2.0.1 Early Stopping
Next, I tried to implement early stopping to see if the latter would improve the F1 score.
X_train_sub = pd.read_csv(r'C:\Users\Yasmeen\Documents\STAT 303-3\project\X_train.csv', sep=',')
X_test_sub = pd.read_csv(r'C:\Users\Yasmeen\Documents\STAT 303-3\project\X_test.csv', sep=',')
y_train_sub = pd.read_csv(r'C:\Users\Yasmeen\Documents\STAT 303-3\project\y_train.csv', sep=',')
y_test_sub = pd.read_csv(r'C:\Users\Yasmeen\Documents\STAT 303-3\project\y_test.csv', sep=',')# mapping the variables from 0 to 7 (the model does not accept the standard format)
num = {3: 0,
4: 1,
5: 2,
6: 3,
7: 4,
8: 5,
9: 6}
y_train_sub = y_train_sub['quality'].map(num)
y_test_sub = y_test_sub['quality'].map(num)y_train_sub0 2
1 3
2 2
3 3
4 3
..
2933 3
2934 3
2935 3
2936 3
2937 5
Name: quality, Length: 2938, dtype: int64
model = xgb.XGBClassifier(objective = 'multi:softmax',random_state=1,gamma=0.04,learning_rate = 0.25,max_depth=10,
n_estimators = 150,reg_lambda = 2.5 ,scale_pos_weight=0.75, num_class=7, early_stopping_rounds=250, eval_metric=lambda x,y : f1_score(x,y,average='weighted'),
seed=45, average = 'weighted')
modelXGBClassifier(average='weighted', base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=250,
enable_categorical=False,
eval_metric=<function <lambda> at 0x000001E655640940>,
feature_types=None, gamma=0.04, gpu_id=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=0.25, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=10,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=150, n_jobs=None,
num_class=7, num_parallel_tree=None, ...)
model.fit(X_train_sub, y_train_sub, eval_set = ([(X_test_sub, y_test_sub)]))[00:23:38] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\learner.cc:767:
Parameters: { "average", "scale_pos_weight" } are not used.
[0] validation_0-mlogloss:1.67495 validation_0-<lambda>:0.53499
[1] validation_0-mlogloss:1.50632 validation_0-<lambda>:0.55004
[2] validation_0-mlogloss:1.38406 validation_0-<lambda>:0.56216
[3] validation_0-mlogloss:1.29445 validation_0-<lambda>:0.57133
[4] validation_0-mlogloss:1.22050 validation_0-<lambda>:0.57796
[5] validation_0-mlogloss:1.16391 validation_0-<lambda>:0.58441
[6] validation_0-mlogloss:1.11866 validation_0-<lambda>:0.59949
[7] validation_0-mlogloss:1.08099 validation_0-<lambda>:0.60166
[8] validation_0-mlogloss:1.04753 validation_0-<lambda>:0.61351
[9] validation_0-mlogloss:1.02144 validation_0-<lambda>:0.61290
[10] validation_0-mlogloss:0.99977 validation_0-<lambda>:0.61994
[11] validation_0-mlogloss:0.98149 validation_0-<lambda>:0.61972
[12] validation_0-mlogloss:0.96586 validation_0-<lambda>:0.62033
[13] validation_0-mlogloss:0.95374 validation_0-<lambda>:0.62073
[14] validation_0-mlogloss:0.94250 validation_0-<lambda>:0.63046
[15] validation_0-mlogloss:0.93267 validation_0-<lambda>:0.63269
[16] validation_0-mlogloss:0.92689 validation_0-<lambda>:0.63740
[17] validation_0-mlogloss:0.92109 validation_0-<lambda>:0.63560
[18] validation_0-mlogloss:0.91699 validation_0-<lambda>:0.63884
[19] validation_0-mlogloss:0.91321 validation_0-<lambda>:0.64097
[20] validation_0-mlogloss:0.91021 validation_0-<lambda>:0.64114
[21] validation_0-mlogloss:0.90680 validation_0-<lambda>:0.64062
[22] validation_0-mlogloss:0.90262 validation_0-<lambda>:0.64235
[23] validation_0-mlogloss:0.90013 validation_0-<lambda>:0.64101
[24] validation_0-mlogloss:0.89662 validation_0-<lambda>:0.64638
[25] validation_0-mlogloss:0.89376 validation_0-<lambda>:0.64436
[26] validation_0-mlogloss:0.89220 validation_0-<lambda>:0.64409
[27] validation_0-mlogloss:0.88995 validation_0-<lambda>:0.65077
[28] validation_0-mlogloss:0.89009 validation_0-<lambda>:0.65182
[29] validation_0-mlogloss:0.89040 validation_0-<lambda>:0.65327
[30] validation_0-mlogloss:0.89018 validation_0-<lambda>:0.65285
[31] validation_0-mlogloss:0.89019 validation_0-<lambda>:0.65193
[32] validation_0-mlogloss:0.89038 validation_0-<lambda>:0.65151
[33] validation_0-mlogloss:0.89017 validation_0-<lambda>:0.65140
[34] validation_0-mlogloss:0.88928 validation_0-<lambda>:0.65318
[35] validation_0-mlogloss:0.88889 validation_0-<lambda>:0.65638
[36] validation_0-mlogloss:0.89014 validation_0-<lambda>:0.65868
[37] validation_0-mlogloss:0.89172 validation_0-<lambda>:0.65488
[38] validation_0-mlogloss:0.89205 validation_0-<lambda>:0.65402
[39] validation_0-mlogloss:0.89335 validation_0-<lambda>:0.65637
[40] validation_0-mlogloss:0.89435 validation_0-<lambda>:0.65586
[41] validation_0-mlogloss:0.89477 validation_0-<lambda>:0.65644
[42] validation_0-mlogloss:0.89611 validation_0-<lambda>:0.65317
[43] validation_0-mlogloss:0.89843 validation_0-<lambda>:0.65624
[44] validation_0-mlogloss:0.89959 validation_0-<lambda>:0.65330
[45] validation_0-mlogloss:0.90076 validation_0-<lambda>:0.65630
[46] validation_0-mlogloss:0.90069 validation_0-<lambda>:0.65571
[47] validation_0-mlogloss:0.90143 validation_0-<lambda>:0.65380
[48] validation_0-mlogloss:0.90219 validation_0-<lambda>:0.65277
[49] validation_0-mlogloss:0.90409 validation_0-<lambda>:0.65442
[50] validation_0-mlogloss:0.90475 validation_0-<lambda>:0.65343
[51] validation_0-mlogloss:0.90656 validation_0-<lambda>:0.65351
[52] validation_0-mlogloss:0.90719 validation_0-<lambda>:0.65504
[53] validation_0-mlogloss:0.90774 validation_0-<lambda>:0.65587
[54] validation_0-mlogloss:0.90942 validation_0-<lambda>:0.65779
[55] validation_0-mlogloss:0.91075 validation_0-<lambda>:0.65556
[56] validation_0-mlogloss:0.91293 validation_0-<lambda>:0.65823
[57] validation_0-mlogloss:0.91374 validation_0-<lambda>:0.65761
[58] validation_0-mlogloss:0.91650 validation_0-<lambda>:0.65639
[59] validation_0-mlogloss:0.91769 validation_0-<lambda>:0.65673
[60] validation_0-mlogloss:0.91904 validation_0-<lambda>:0.65666
[61] validation_0-mlogloss:0.91988 validation_0-<lambda>:0.65674
[62] validation_0-mlogloss:0.92188 validation_0-<lambda>:0.65677
[63] validation_0-mlogloss:0.92258 validation_0-<lambda>:0.65871
[64] validation_0-mlogloss:0.92383 validation_0-<lambda>:0.66164
[65] validation_0-mlogloss:0.92503 validation_0-<lambda>:0.66079
[66] validation_0-mlogloss:0.92672 validation_0-<lambda>:0.66022
[67] validation_0-mlogloss:0.92787 validation_0-<lambda>:0.65930
[68] validation_0-mlogloss:0.92890 validation_0-<lambda>:0.65839
[69] validation_0-mlogloss:0.92936 validation_0-<lambda>:0.65837
[70] validation_0-mlogloss:0.92966 validation_0-<lambda>:0.65700
[71] validation_0-mlogloss:0.93204 validation_0-<lambda>:0.65809
[72] validation_0-mlogloss:0.93387 validation_0-<lambda>:0.65971
[73] validation_0-mlogloss:0.93621 validation_0-<lambda>:0.65765
[74] validation_0-mlogloss:0.93732 validation_0-<lambda>:0.65868
[75] validation_0-mlogloss:0.93981 validation_0-<lambda>:0.65979
[76] validation_0-mlogloss:0.94075 validation_0-<lambda>:0.65766
[77] validation_0-mlogloss:0.94144 validation_0-<lambda>:0.65836
[78] validation_0-mlogloss:0.94284 validation_0-<lambda>:0.65731
[79] validation_0-mlogloss:0.94444 validation_0-<lambda>:0.65779
[80] validation_0-mlogloss:0.94626 validation_0-<lambda>:0.65784
[81] validation_0-mlogloss:0.94759 validation_0-<lambda>:0.65600
[82] validation_0-mlogloss:0.94943 validation_0-<lambda>:0.65500
[83] validation_0-mlogloss:0.95030 validation_0-<lambda>:0.65595
[84] validation_0-mlogloss:0.95227 validation_0-<lambda>:0.65387
[85] validation_0-mlogloss:0.95400 validation_0-<lambda>:0.65470
[86] validation_0-mlogloss:0.95598 validation_0-<lambda>:0.65471
[87] validation_0-mlogloss:0.95635 validation_0-<lambda>:0.65312
[88] validation_0-mlogloss:0.95773 validation_0-<lambda>:0.65201
[89] validation_0-mlogloss:0.95915 validation_0-<lambda>:0.65404
[90] validation_0-mlogloss:0.96018 validation_0-<lambda>:0.65449
[91] validation_0-mlogloss:0.96204 validation_0-<lambda>:0.65444
[92] validation_0-mlogloss:0.96420 validation_0-<lambda>:0.65404
[93] validation_0-mlogloss:0.96534 validation_0-<lambda>:0.65461
[94] validation_0-mlogloss:0.96652 validation_0-<lambda>:0.65767
[95] validation_0-mlogloss:0.96667 validation_0-<lambda>:0.65524
[96] validation_0-mlogloss:0.96739 validation_0-<lambda>:0.65623
[97] validation_0-mlogloss:0.96780 validation_0-<lambda>:0.65637
[98] validation_0-mlogloss:0.96829 validation_0-<lambda>:0.65540
[99] validation_0-mlogloss:0.96967 validation_0-<lambda>:0.65636
[100] validation_0-mlogloss:0.97082 validation_0-<lambda>:0.65435
[101] validation_0-mlogloss:0.97195 validation_0-<lambda>:0.65735
[102] validation_0-mlogloss:0.97280 validation_0-<lambda>:0.65641
[103] validation_0-mlogloss:0.97325 validation_0-<lambda>:0.65739
[104] validation_0-mlogloss:0.97413 validation_0-<lambda>:0.65632
[105] validation_0-mlogloss:0.97475 validation_0-<lambda>:0.65843
[106] validation_0-mlogloss:0.97637 validation_0-<lambda>:0.65683
[107] validation_0-mlogloss:0.97689 validation_0-<lambda>:0.65690
[108] validation_0-mlogloss:0.97823 validation_0-<lambda>:0.65640
[109] validation_0-mlogloss:0.97930 validation_0-<lambda>:0.65590
[110] validation_0-mlogloss:0.98029 validation_0-<lambda>:0.65786
[111] validation_0-mlogloss:0.98169 validation_0-<lambda>:0.65689
[112] validation_0-mlogloss:0.98271 validation_0-<lambda>:0.65638
[113] validation_0-mlogloss:0.98368 validation_0-<lambda>:0.65638
[114] validation_0-mlogloss:0.98408 validation_0-<lambda>:0.65841
[115] validation_0-mlogloss:0.98503 validation_0-<lambda>:0.65744
[116] validation_0-mlogloss:0.98600 validation_0-<lambda>:0.65639
[117] validation_0-mlogloss:0.98646 validation_0-<lambda>:0.65543
[118] validation_0-mlogloss:0.98760 validation_0-<lambda>:0.65686
[119] validation_0-mlogloss:0.98831 validation_0-<lambda>:0.65582
[120] validation_0-mlogloss:0.98901 validation_0-<lambda>:0.65534
[121] validation_0-mlogloss:0.98980 validation_0-<lambda>:0.65443
[122] validation_0-mlogloss:0.99053 validation_0-<lambda>:0.65496
[123] validation_0-mlogloss:0.99126 validation_0-<lambda>:0.65600
[124] validation_0-mlogloss:0.99163 validation_0-<lambda>:0.65408
[125] validation_0-mlogloss:0.99267 validation_0-<lambda>:0.65551
[126] validation_0-mlogloss:0.99306 validation_0-<lambda>:0.65698
[127] validation_0-mlogloss:0.99380 validation_0-<lambda>:0.65691
[128] validation_0-mlogloss:0.99483 validation_0-<lambda>:0.65698
[129] validation_0-mlogloss:0.99552 validation_0-<lambda>:0.65650
[130] validation_0-mlogloss:0.99721 validation_0-<lambda>:0.65450
[131] validation_0-mlogloss:0.99782 validation_0-<lambda>:0.65493
[132] validation_0-mlogloss:0.99831 validation_0-<lambda>:0.65560
[133] validation_0-mlogloss:0.99917 validation_0-<lambda>:0.65544
[134] validation_0-mlogloss:1.00014 validation_0-<lambda>:0.65446
[135] validation_0-mlogloss:1.00152 validation_0-<lambda>:0.65446
[136] validation_0-mlogloss:1.00188 validation_0-<lambda>:0.65449
[137] validation_0-mlogloss:1.00215 validation_0-<lambda>:0.65500
[138] validation_0-mlogloss:1.00233 validation_0-<lambda>:0.65547
[139] validation_0-mlogloss:1.00351 validation_0-<lambda>:0.65450
[140] validation_0-mlogloss:1.00404 validation_0-<lambda>:0.65504
[141] validation_0-mlogloss:1.00491 validation_0-<lambda>:0.65652
[142] validation_0-mlogloss:1.00553 validation_0-<lambda>:0.65505
[143] validation_0-mlogloss:1.00715 validation_0-<lambda>:0.65652
[144] validation_0-mlogloss:1.00727 validation_0-<lambda>:0.65603
[145] validation_0-mlogloss:1.00788 validation_0-<lambda>:0.65503
[146] validation_0-mlogloss:1.00834 validation_0-<lambda>:0.65507
[147] validation_0-mlogloss:1.00881 validation_0-<lambda>:0.65613
[148] validation_0-mlogloss:1.00898 validation_0-<lambda>:0.65613
[149] validation_0-mlogloss:1.00970 validation_0-<lambda>:0.65554
XGBClassifier(average='weighted', base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=250,
enable_categorical=False,
eval_metric=<function <lambda> at 0x000001E655640940>,
feature_types=None, gamma=0.04, gpu_id=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=0.25, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=10,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=150, n_jobs=None,
num_class=7, num_parallel_tree=None, ...)
model.score(X_test,y_test)0.5535714285714286
2.0.1.0.1 Early stopping does not seem to help the model performance as the F1 score is much smaller than the one without early stopping.
3 Model after feature transformation
As previously discussed, some of the features are skewed (specifically, residual sugar). In order to take that into account, a log transform of some of the features was implemented (along with retunning the model).
X_train = pd.read_csv(r'C:\Users\Yasmeen\Documents\STAT 303-3\project\X_train.csv', sep=',')
X_test = pd.read_csv(r'C:\Users\Yasmeen\Documents\STAT 303-3\project\X_test.csv', sep=',')
y_train = pd.read_csv(r'C:\Users\Yasmeen\Documents\STAT 303-3\project\y_train.csv', sep=',')
y_test = pd.read_csv(r'C:\Users\Yasmeen\Documents\STAT 303-3\project\y_test.csv', sep=',')3.1 XG Boost Model
# mapping the variables from 0 to 7 (the model does not accept the standard format)
num = {3: 0,
4: 1,
5: 2,
6: 3,
7: 4,
8: 5,
9: 6}
y_train = y_train['quality'].map(num)
y_test = y_test['quality'].map(num)3.2 Transforming Feature
Residula sugar was the most prominently skewed variable, so I decided only the log transform for residual sugar. All of these different combinations were ran and there was no improvement in the accuracy and F1 score of the model.
#train
X_train['residual sugar'] = np.log(X_train['residual sugar'])
#X_train['chlorides'] = np.log(X_train['chlorides'])
#X_train['sulphates'] = np.log(X_train['sulphates'])
#test
X_test['residual sugar'] = np.log(X_test['residual sugar'])
#X_test['chlorides'] = np.log(X_test['chlorides'])
#X_test['sulphates'] = np.log(X_test['sulphates'])sns.distplot(X_train['residual sugar'])<AxesSubplot:xlabel='residual sugar', ylabel='Density'>
I will now repeat the whole process of running the model and tuning it using the new transformed variables.
3.3 Balancing Weights of Classes (with transformed variables)
# taking weights into account
def CreateBalancedSampleWeights(y_train, largest_class_weight_coef):
classes = np.unique(y_train, axis = 0)
classes.sort()
class_samples = np.bincount(y_train)
total_samples = class_samples.sum()
n_classes = len(class_samples)
weights = total_samples / (n_classes * class_samples * 1.0)
class_weight_dict = {key : value for (key, value) in zip(classes, weights)}
class_weight_dict[classes[1]] = class_weight_dict[classes[1]] * largest_class_weight_coef
sample_weights = [class_weight_dict[y] for y in y_train]
return sample_weightslargest_class_weight_coef = max(y_train.value_counts().values)/y_train.shape[0]
#pass y_train as numpy array
weight = CreateBalancedSampleWeights(y_train, largest_class_weight_coef)len(weight)2938
y_train0 2
1 3
2 2
3 3
4 3
..
2933 3
2934 3
2935 3
2936 3
2937 5
Name: quality, Length: 2938, dtype: int64
3.3.1 Creating model (with transformed variables)
# Create XGBoost classifier with the custom evaluation metric
model = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
seed=45)
model.fit(X_train, y_train, sample_weight = weight)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False,
eval_metric=<function f1_score at 0x00000295BED9F3A0>,
feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=None, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100, n_jobs=None,
num_class=7, num_parallel_tree=None, objective='multi:softmax', ...)
pd.concat([pd.Series(X_train.columns, name = 'predictor'),
pd.Series(model.feature_importances_,
name = 'importance')], axis = 1).sort_values(by = 'importance', ascending=False)| predictor | importance | |
|---|---|---|
| 10 | alcohol | 0.152676 |
| 0 | fixed acidity | 0.143950 |
| 5 | free sulfur dioxide | 0.119312 |
| 7 | density | 0.104607 |
| 8 | pH | 0.088572 |
| 1 | volatile acidity | 0.081902 |
| 4 | chlorides | 0.075309 |
| 3 | residual sugar | 0.062078 |
| 6 | total sulfur dioxide | 0.060991 |
| 9 | sulphates | 0.059429 |
| 2 | citric acid | 0.051174 |
4 Tuning of Parameters (with transformed variables)
4.0.1 Number of Trees
4.0.1.1 Coarse
def get_models():
models = dict()
# define number of trees to consider
n_trees = [5, 10, 50, 100, 500, 1000, 2000, 5000]
for n in n_trees:
models[str(n)] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
n_estimators=n,
random_state = 1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('Number of trees',fontsize=15)>5 0.457 (0.021)
>10 0.501 (0.015)
>50 0.583 (0.009)
>100 0.590 (0.011)
>500 0.592 (0.015)
>1000 0.591 (0.015)
>2000 0.593 (0.012)
>5000 0.590 (0.010)
Text(0.5, 0, 'Number of trees')
4.0.1.2 Fine
def get_models():
models = dict()
# define number of trees to consider
n_trees = [60, 80, 100, 150, 200]
for n in n_trees:
models[str(n)] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
n_estimators=n,
random_state = 1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('Number of trees',fontsize=15)>60 0.589 (0.009)
>80 0.597 (0.008)
>100 0.590 (0.011)
>150 0.593 (0.016)
>200 0.593 (0.017)
Text(0.5, 0, 'Number of trees')
An ideal range for the number of trees to be used in the model is around [80].
4.1 Depth
4.1.0.0.0.1 Coarse
# get a list of models to evaluate
def get_models():
models = dict()
# explore depths from 1 to 10
depth = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
for n in depth:
models[str(n)] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
max_depth=n,
random_state = 1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('Number of trees',fontsize=15)>3 0.532 (0.014)
>4 0.565 (0.014)
>5 0.586 (0.005)
>6 0.590 (0.011)
>7 0.593 (0.010)
>8 0.600 (0.014)
>9 0.600 (0.014)
>10 0.596 (0.009)
>11 0.594 (0.010)
>12 0.598 (0.018)
>13 0.593 (0.015)
Text(0.5, 0, 'Number of trees')
4.1.0.0.1 Fine
# get a list of models to evaluate
def get_models():
models = dict()
# explore depths from 1 to 10
depth = [6, 7, 8, 9, 10]
for n in depth:
models[str(n)] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
max_depth=n,
random_state = 1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('Number of trees',fontsize=15)>6 0.590 (0.011)
>7 0.593 (0.010)
>8 0.600 (0.014)
>9 0.600 (0.014)
>10 0.596 (0.009)
Text(0.5, 0, 'Number of trees')
The ideal range of the depth of trees that seems ideal is [8, 9].
4.2 Learning Rate
# get a list of models to evaluate
def get_models():
models = dict()
for i in [0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.8,1.0]:
key = '%.4f' % i
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
learning_rate= i,
random_state = 1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('Learning Rate',fontsize=15)>0.0100 0.433 (0.017)
>0.0500 0.510 (0.012)
>0.1000 0.562 (0.015)
>0.2000 0.592 (0.004)
>0.3000 0.590 (0.011)
>0.4000 0.595 (0.016)
>0.5000 0.591 (0.009)
>0.6000 0.587 (0.016)
>0.8000 0.585 (0.016)
>1.0000 0.587 (0.008)
Text(0.5, 0, 'Learning Rate')
4.2.0.0.0.1 Fine
# get a list of models to evaluate
def get_models():
models = dict()
for i in [0.1, 0.15, 0.2, 0.25,0.3, 0.35, 0.4]:
key = '%.4f' % i
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
learning_rate= i,
random_state = 1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('Learning Rate',fontsize=15)>0.1000 0.562 (0.015)
>0.1500 0.584 (0.014)
>0.2000 0.592 (0.004)
>0.2500 0.588 (0.006)
>0.3000 0.590 (0.011)
>0.3500 0.593 (0.014)
>0.4000 0.595 (0.016)
Text(0.5, 0, 'Learning Rate')
The learning rates of [0.2, 0.3, 0.35] seems ideal.
4.3 Reg Lambda
L2 regularization, similar to ridge regression. Increase value will make model more conservative.
4.3.0.0.0.1 Coarse
def get_models():
models = dict()
for i in [0,0.5,1.0,1.5,2,10,100, 500, 1000]:
key = '%.2f' % i
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
reg_lambda=i,
random_state=1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('reg_lambda',fontsize=15)>0.00 0.597 (0.013)
>0.50 0.587 (0.011)
>1.00 0.590 (0.011)
>1.50 0.589 (0.014)
>2.00 0.589 (0.008)
>10.00 0.582 (0.002)
>100.00 0.542 (0.012)
>500.00 0.491 (0.010)
>1000.00 0.471 (0.012)
Text(0.5, 0, 'reg_lambda')
4.3.0.0.0.2 Fine
def get_models():
models = dict()
for i in [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 5.0, 10]:
key = '%.1f' % i
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
reg_lambda=i,
random_state=1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('reg_lambda',fontsize=15)>0.5 0.587 (0.011)
>1.0 0.590 (0.011)
>1.5 0.589 (0.014)
>2.0 0.589 (0.008)
>2.5 0.588 (0.019)
>3.0 0.590 (0.009)
>5.0 0.588 (0.012)
>10.0 0.582 (0.002)
Text(0.5, 0, 'reg_lambda')
A reg lambda value of [1.0, 2.5, 3.0] seem ideal.
4.4 Gamma
Specifies the minimum values of the loss reduction (by the loss function) required to make a split. It makes the algorithm more conservative and avoids overfitting.
4.4.0.0.0.1 Coarse
def get_models():
models = dict()
for i in [0,10,1e2,1e3,1e4,1e5,1e6,1e7,1e8,1e9]:
key = '%.0f' % i
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
gamma=i,
random_state=1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('reg_lambda',fontsize=15)>0 0.590 (0.011)
>10 0.417 (0.011)
>100 0.211 (0.038)
>1000 0.035 (0.050)
>10000 0.035 (0.050)
>100000 0.035 (0.050)
>1000000 0.035 (0.050)
>10000000 0.035 (0.050)
>100000000 0.035 (0.050)
>1000000000 0.035 (0.050)
Text(0.5, 0, 'reg_lambda')
4.4.0.0.1 Fine
def get_models():
models = dict()
for i in [0, 1, 2, 3, 4, 5]:
key = '%.1f' % i
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
gamma=i,
random_state=1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('reg_lambda',fontsize=15)>0.0 0.590 (0.011)
>1.0 0.537 (0.011)
>2.0 0.512 (0.016)
>3.0 0.481 (0.015)
>4.0 0.469 (0.011)
>5.0 0.439 (0.016)
Text(0.5, 0, 'reg_lambda')
4.4.0.0.2 Super fine search
def get_models():
models = dict()
for i in [0, 0.1, 0.2, 0.3, 0.4, 0.5]:
key = '%.1f' % i
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
gamma=i,
random_state=1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('reg_lambda',fontsize=15)>0.0 0.590 (0.011)
>0.1 0.594 (0.016)
>0.2 0.590 (0.012)
>0.3 0.588 (0.011)
>0.4 0.576 (0.016)
>0.5 0.562 (0.013)
Text(0.5, 0, 'reg_lambda')
4.4.0.0.3 Super super fine search
def get_models():
models = dict()
for i in [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]:
key = '%.3f' % i
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
gamma=i,
random_state=1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('reg_lambda',fontsize=15)>0.000 0.590 (0.011)
>0.010 0.588 (0.006)
>0.020 0.593 (0.012)
>0.030 0.589 (0.010)
>0.040 0.595 (0.012)
>0.050 0.591 (0.005)
>0.060 0.595 (0.008)
>0.070 0.599 (0.011)
>0.080 0.593 (0.014)
>0.090 0.588 (0.013)
>0.100 0.594 (0.016)
Text(0.5, 0, 'reg_lambda')
Gamma [0.07, 0.08] seems ideal.
4.5 Subsample
4.5.0.0.1 Coarse
def get_models():
models = dict()
sub = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
for s in sub:
key = '%.2f' % s
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
subsample = s,
random_state=1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('reg_lambda',fontsize=15)>0.10 0.520 (0.017)
>0.20 0.553 (0.013)
>0.30 0.583 (0.012)
>0.40 0.588 (0.008)
>0.50 0.587 (0.013)
>0.60 0.592 (0.014)
>0.70 0.592 (0.013)
>0.80 0.582 (0.012)
>0.90 0.600 (0.017)
>1.00 0.590 (0.011)
Text(0.5, 0, 'reg_lambda')
4.5.0.0.2 Fine
def get_models():
models = dict()
sub = [0.7, 0.75, 0.8, 0.85, 0.9]
for s in sub:
key = '%.2f' % s
models[key] = xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
eval_metric=f1_score,
subsample = s,
random_state=1)
return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, sample_weights):
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
fit_params = {"sample_weight": sample_weights}
# evaluate the model and collect the results
scores = (cross_val_score(model, X, y, fit_params = fit_params, scoring= 'f1_weighted', cv=cv, n_jobs=-1))
return scores
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
# evaluate the model
scores = evaluate_model(model, X_train, y_train, weight)
# store the results
results.append(scores)
names.append(name)
# summarize the performance along the way
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
plt.boxplot(results, labels=names, showmeans=True)
plt.ylabel('Cross validation error',fontsize=15)
plt.xlabel('reg_lambda',fontsize=15)>0.70 0.592 (0.013)
>0.75 0.581 (0.013)
>0.80 0.582 (0.012)
>0.85 0.599 (0.009)
>0.90 0.600 (0.017)
Text(0.5, 0, 'reg_lambda')
Seems like the optimal range for coarse subsample is [0.7, 0.85, 0.9]
4.6 Model Performance
start_time = time.time()
param_grid = {'n_estimators':[80],
'max_depth': [8, 9],
'learning_rate': [0.2, 0.3, 0.35],
'gamma': [0.07, 0.08],
'reg_lambda':[1.0, 2.5, 3.0],
'subsample': [0.7, 0.85, 0.9]}
cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=1)
optimal_params = GridSearchCV(estimator=xgb.XGBClassifier(objective='multi:softmax',
num_class=7,
random_state=1,
use_label_encoder=False),
param_grid = param_grid,
scoring = 'f1_weighted',
verbose = 1,
n_jobs= -1,
cv = cv)
optimal_params.fit(X_train, y_train, sample_weight = weight)
print(optimal_params.best_params_,optimal_params.best_score_)
print("Time taken = ", (time.time()-start_time)/60, " minutes")Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'gamma': 0.08, 'learning_rate': 0.2, 'max_depth': 8, 'n_estimators': 80, 'reg_lambda': 1.0, 'subsample': 0.7} 0.601835394117807
Time taken = 4.485071778297424 minutes
model = xgb.XGBClassifier(objective = 'multi:softmax',random_state=1, gamma=0.06, learning_rate = 0.3, max_depth=9,
n_estimators = 150, reg_lambda = 1.0, subsample = 0.8, num_class= 7, eval_metric=f1_score)model.fit(X_train, y_train, sample_weight = weight)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False,
eval_metric=<function f1_score at 0x00000295BED9F3A0>,
feature_types=None, gamma=0.06, gpu_id=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=0.3, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=9,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=150, n_jobs=None,
num_class=7, num_parallel_tree=None, objective='multi:softmax', ...)
model.score(X_test, y_test)0.6438775510204081
4.6.1 It does not seem like log tranform improves the model as the F1 score is lower than that without variable transformation.
5 CONCLUSION: XGBoost Results
In conclusion, when using XGBoost, it seems that simply tuning the hyperparameters and adjusting for class weights was the best method for increasing the F1 score of the model. The highest F1 score achieved was a score of 65.6%.